Main question to address: What areas of NYC are under-serviced by ADA accessible stations?

Plan: Somehow randomly “sample” locations in the NYC territory in longitude, latitude units and then calculate distance from that point to the nearest accessible subway station. Normalize to nearest (accessible or not accessible) station?

Need: - data for which stations are ada accessible - data for station locations (longitude, latitude) - some way to sample the territory of NYC (not including Staten Island and all the rivers)

library(tidyverse)
library(ggmap)
library(stringr)
library(cowplot)
library(viridis)
library(geosphere)

Not sure which are the most reliable datasets to use here.

ada.raw.data <- read_csv(file = "../data/Elevator Escalator Station Data.csv") %>% 
  arrange(station)
sub.location.raw.data <- read_csv(file = "../data/NYC_Transit_Subway_Entrance_And_Exit_Data.csv") %>% 
  arrange(`Station Name`)
colnames(sub.location.raw.data)[3] <- "station"
colnames(sub.location.raw.data) <- colnames(sub.location.raw.data) %>% 
  str_to_lower() %>% 
  gsub(pattern = " ", replacement = ".")
colnames(ada.raw.data) <- colnames(ada.raw.data) %>% 
  str_to_lower()

Mucking around

# What are the cols and how many missing values per col?
sub.location.raw.data %>% 
  is.na %>% 
  as.data.frame() %>% 
  sapply(sum)
##           division               line            station 
##                  0                  0                  0 
##   station.latitude  station.longitude             route1 
##                  0                  0                  0 
##             route2             route3             route4 
##                848               1374               1547 
##             route5             route6             route7 
##               1630               1741               1788 
##             route8             route9            route10 
##               1820               1840               1845 
##            route11      entrance.type              entry 
##               1845                  0                  0 
##          exit.only            vending           staffing 
##               1812                  0                  0 
##        staff.hours                ada          ada.notes 
##               1828                  0               1793 
##     free.crossover north.south.street   east.west.street 
##                  0                 29                 35 
##             corner  entrance.latitude entrance.longitude 
##                 32                  0                  0 
##   station.location  entrance.location 
##                  0                  0
ada.raw.data %>% 
  is.na %>% 
  as.data.frame() %>% 
  sapply(sum)
##       station       borough       trainno   equipmentno equipmenttype 
##             0             0             0             0             0 
##       serving           ada 
##             0             0
full.join.mess <- ada.raw.data %>% 
  full_join(sub.location.raw.data, by = "station") %>% 
  arrange(station)
inner.join.mess <- ada.raw.data %>% 
  inner_join(sub.location.raw.data, by = "station") %>% 
  arrange(station)
full.join.mess %>% 
  filter(!(station %in% unique(inner.join.mess$station)))
## # A tibble: 2,046 x 38
##     station borough trainno equipmentno equipmenttype serving ada.x
##       <chr>   <chr>   <chr>       <chr>         <chr>   <chr> <chr>
##  1 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  2 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  3 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  4 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  5 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  6 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  7 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  8 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
##  9 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
## 10 103rd St    <NA>    <NA>        <NA>          <NA>    <NA>  <NA>
## # ... with 2,036 more rows, and 31 more variables: division <chr>,
## #   line <chr>, station.latitude <dbl>, station.longitude <dbl>,
## #   route1 <chr>, route2 <chr>, route3 <chr>, route4 <chr>, route5 <chr>,
## #   route6 <chr>, route7 <chr>, route8 <int>, route9 <int>, route10 <int>,
## #   route11 <int>, entrance.type <chr>, entry <chr>, exit.only <chr>,
## #   vending <chr>, staffing <chr>, staff.hours <chr>, ada.y <lgl>,
## #   ada.notes <chr>, free.crossover <lgl>, north.south.street <chr>,
## #   east.west.street <chr>, corner <chr>, entrance.latitude <dbl>,
## #   entrance.longitude <dbl>, station.location <chr>,
## #   entrance.location <chr>
# need to work on this

The Subway Entrance and Exit data seems to be good enough for an initial analysis. Hopefully the ADA column in that dataset is reliable. Need to compare this with the other datasets collected at some point, but joining is a problem. To do: - check against Wire Monkey’s joined dataset

For now, stick with the sub.location.raw.data for the rest of the analysis.

Map of all subway stations in this dataset:

# map of all the stations
nyc.map <- get_map(location = "New York City", maptype = "roadmap")
ggmap(nyc.map) +
  geom_point(data = sub.location.raw.data, aes(x = station.longitude, y = station.latitude)) +
  ylim(40.495992, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.257159, -73.699215) +  # NYC city limits longitude coordinates
  xlab("longitude") +
  ylab("latitude") +
  ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data")
## Warning: Removed 1 rows containing missing values (geom_rect).

# Note: no stations on Staten Island - will exclude it in the following analysis

Notes: - Most stations have more than one row (multiple entrances/exits) - need to compress info - Station names in the station column are NOT unique (multiple 103rd St, etc) in the entrance/exit data, but station location may be a unique consistent handle to grab all the entrances/exits associated with a station - For now, will call any station that has at least one ADA-accessible entrance/exit “accessible”, but we know that some stations are only partially accessible. Different train lines are serviced by different platforms within the same stations, and not all are fully accessible. This is where the elevator/escalator data joining may be useful for future analysis.

head(sub.location.raw.data$ada)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE
sub.loc.ada.sum <- sub.location.raw.data %>% 
  group_by(station.location) %>%  # seems to be the best unique key for stations in the dataset
  mutate(ada.sum = sum(ada)) %>%  # which stations are ada accessible? 
  select(division:entrance.type, ada, station.location, ada.sum) %>% 
  unique()
sub.loc.ada.sum %>% 
  ungroup() %>% 
  arrange(desc(ada.sum)) %>% 
  select(division:station, entrance.type:ada, ada.sum)
## # A tibble: 585 x 6
##    division             line                        station entrance.type
##       <chr>            <chr>                          <chr>         <chr>
##  1      IND         6 Avenue 47-50th Sts Rockefeller Center      Easement
##  2      IND         6 Avenue 47-50th Sts Rockefeller Center      Elevator
##  3      IND         6 Avenue 47-50th Sts Rockefeller Center         Stair
##  4      IND         6 Avenue 47-50th Sts Rockefeller Center         Stair
##  5      IND         8 Avenue                        34th St      Elevator
##  6      IND         8 Avenue                        34th St         Stair
##  7      IND Queens Boulevard               Jamaica-179th St      Elevator
##  8      IND Queens Boulevard               Jamaica-179th St         Stair
##  9      IND         8 Avenue                        59th St      Easement
## 10      IND         8 Avenue                        59th St      Elevator
## # ... with 575 more rows, and 2 more variables: ada <lgl>, ada.sum <int>
sub.loc.ada.sum %>% 
  ungroup() %>% 
  arrange(desc(station)) %>% 
  select(division:station, entrance.type:ada, ada.sum)
## # A tibble: 585 x 6
##    division      line                 station entrance.type   ada ada.sum
##       <chr>     <chr>                   <chr>         <chr> <lgl>   <int>
##  1      IRT    Pelham               Zerega Av         Stair FALSE       0
##  2      IND  6 Avenue                 York St         Stair FALSE       0
##  3      IND Concourse Yankee Stadium-161st St      Elevator  TRUE       8
##  4      IND Concourse Yankee Stadium-161st St         Stair  TRUE       8
##  5      IND Concourse Yankee Stadium-161st St          Door  TRUE       8
##  6      IRT    Jerome Yankee Stadium-161st St      Elevator  TRUE       8
##  7      IRT    Jerome Yankee Stadium-161st St         Stair  TRUE       8
##  8      IND  8 Avenue      World Trade Center         Stair  TRUE       8
##  9      IRT  Flushing     Woodside Av-61st St     Escalator  TRUE       3
## 10      IRT  Flushing     Woodside Av-61st St         Stair  TRUE       3
## # ... with 575 more rows

Looking over the result, stations with only “Stair” entrances are not ada-accessible (ada.sum == 0), but stations with “Elevator” and “Escalator” entrances have more than one ada-accessible entrance (ada.sum > 0). Will call stations with ada.sum > 0 ADA-Accessible. The total value of the ada.sum probably doesn’t matter much on it’s own. Larger stations with more entrances will have a higher ada.sum score just because of the number of entrances.

sub.loc.ada.unique <- sub.loc.ada.sum %>% 
  select(division:route11, station.location:ada.sum) %>% 
  unique() %>% 
  ungroup() %>% 
  arrange(ada.sum, station, line) 
# Probably don't need to keep dragging the route information around, but who knows

sub.loc.ada.annot <- sub.loc.ada.unique %>% 
  filter(ada.sum == 0) %>% 
  mutate(ada.class = "not.ada") %>% 
  bind_rows(
    sub.loc.ada.unique %>% 
      filter(ada.sum > 0) %>% 
      mutate(ada.class = "ada.complnt")
    ) %>% 
  arrange(station)

sub.loc.ada.annot %>% 
  ggplot(aes(x = station.longitude, y = station.latitude, color = ada.class)) +
  geom_point(size = 3, alpha = 0.5) +
  coord_quickmap()

ggmap(nyc.map) +
  geom_point(
    data = sub.loc.ada.annot, 
    aes(x = station.longitude, y = station.latitude, color = ada.class),
    size = 3) +
  ylim(40.55, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.1, -73.699215) +  # NYC city limits longitude coordinates
  xlab("longitude") +
  ylab("latitude") +
  ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data\nColor by Accessibility")
## Warning: Removed 1 rows containing missing values (geom_rect).

Now the next questions is how do I “sample” points from the area of NYC?

The NYC map shapefile might be useful?

Borough boundary shapefile from: http://www1.nyc.gov/site/planning/data-maps/open-data/districts-download-metadata.page

NYC city limits? (same source as above)

West Longitude: -74.257159 East Longitude: -73.699215 North Latitude: 40.915568 South Latitude: 40.495992

library(rgdal)
nyc.shapefile <- readOGR(dsn = "../data/nybb_17c/nybb.shp")
## OGR data source with driver: ESRI Shapefile 
## Source: "../data/nybb_17c/nybb.shp", layer: "nybb"
## with 5 features
## It has 4 fields
nyc.map.df <- fortify(nyc.shapefile)
nyc.shapefile.map <- ggplot() +
  geom_path(
    data = nyc.map.df, aes(x = long, y = lat, group = group), size = 0.2
    ) 

Grabbed the locations from the 2015 NYC 311 calls data - use as random sample?

nyc.311.loc <- read_csv(file = "../data/nyc_311_2015_locations.csv") %>% 
  filter(Borough == "BROOKLYN" | Borough == "BRONX" | Borough == "MANHATTAN" | Borough == "QUEENS") %>% 
  unique() %>%  # a lot of calls come from an identical lat/long
  filter(Longitude > -78.0)  # there's one odd point out
# too many points to just plot, but here's a density plot:

nyc.311.loc %>% 
  ggplot(aes(x = Longitude, y = Latitude)) +
  geom_bin2d(bins = 300) +
  scale_fill_viridis(option = "B") +
  coord_quickmap()

# can do a geom_point map with all points, but it takes a long time
# uncomment if you want to see:
#nyc.311.loc %>% 
#  ggplot(aes(x = Longitude, y = Latitude)) +
#  geom_point(alpha = 0.2, size = 0.25) +
#  coord_quickmap()

Not bad, can work.

test6 <- sub.loc.ada.annot %>% 
  mutate(station.id = paste(station, division, line, sep = " / ")) 
loc.test <- nyc.311.loc %>% 
  slice(1:10)


test6 %>% 
  group_by(station.id) %>% 
  count() %>% 
  filter(n > 1)
## # A tibble: 5 x 2
## # Groups:   station.id [5]
##                                        station.id     n
##                                             <chr> <int>
## 1 47-50th Sts Rockefeller Center / IND / 6 Avenue     2
## 2     Brooklyn Bridge-City Hall / IRT / Lexington     8
## 3    Canarsie - Rockaway Parkway / BMT / Canarsie     3
## 4                      Chambers St / BMT / Nassau     2
## 5   Forest Hills-71st Av / IND / Queens Boulevard     2
test7 <- test6 %>% 
  group_by(station.id) %>% 
  mutate(avg.lat = mean(station.latitude)) %>% 
  mutate(avg.long = mean(station.longitude))
test8 <- test7 %>% 
  select(station.id, division:station, avg.lat:avg.long, ada.class) %>% 
  unique()
nrow(test8)
## [1] 465
length(test8$station.id)
## [1] 465
t1 <- rbind(test8$station.id, test8$station.id)
t2 <- as.data.frame(rbind(t1, t1, t1, t1, t1), stringsAsFactors = FALSE)
colnames(t2) <- paste("stat", 1:ncol(t2), sep = "")

t3 <- t2 %>% select(1:5)

test9 <- bind_cols(loc.test, t3) 
test10 <- test9 %>%
  gather(key = "station.num", value = "station.id", 4:8) %>% 
  arrange(station.num)
test11 <- test10 %>% 
  inner_join(test8, by = "station.id")
test12 <- test11 %>% 
  mutate(dist.to.stat = distGeo(p1 = as.matrix(test11 %>% select(Longitude, Latitude)), p2 = as.matrix(test11 %>% select(avg.long, avg.lat))) * 0.000621371) %>% 
  mutate(loc.id = paste(Borough, Latitude, Longitude, sep = " / "))


test12 %>% 
  group_by(loc.id) %>% 
  summarize(nearest.stat = min(dist.to.stat)) %>% 
  ungroup() %>% 
  inner_join((test12 %>% select(Longitude, Latitude, loc.id)), by = "loc.id") %>% 
  unique() %>% 
  ggplot(aes(x = Longitude, y = Latitude, color = nearest.stat)) +
  geom_point(size = 3) +
  scale_color_viridis(option = "D")

To tackle - Separate distance calculations by borough for both the random point and the station, in case “closest” stations are across a river. Probably can keep Queens and Brooklyn together? -